In [263]:
import pandas as pd
In [264]:
data = pd.read_csv('stocks.csv')  # load raw OHLCV data (relative path — run from the notebook's directory)
In [265]:
print(data.head())  # quick look at the first rows (a bare `data.head()` would render as a richer HTML table)
  Ticker        Date        Open        High         Low       Close  \
0   AAPL  07-02-2023  150.639999  155.229996  150.639999  154.649994   
1   AAPL  08-02-2023  153.880005  154.580002  151.169998  151.919998   
2   AAPL  09-02-2023  153.779999  154.330002  150.419998  150.869995   
3   AAPL  10-02-2023  149.460007  151.339996  149.220001  151.009995   
4   AAPL  13-02-2023  150.949997  154.259995  150.919998  153.850006   

    Adj Close    Volume  
0  154.414230  83322600  
1  151.688400  64120100  
2  150.639999  56007100  
3  151.009995  57450700  
4  153.850006  62199000  
In [266]:
data=data.dropna()  # drop rows with any missing values (note: this rebinds `data`, so the raw frame is no longer recoverable)
In [436]:
#Correlation Heatmap – Feature Relationships
import matplotlib.pyplot as plt  # imported locally: the notebook's matplotlib import lives in a much later cell, so Restart & Run All would fail here without it
import seaborn as sns

# Pairwise Pearson correlations between the price/volume features.
plt.figure(figsize=(10, 6))
sns.heatmap(data[["Close", "Volume", "Open", "High", "Low"]].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()
No description has been provided for this image
In [267]:
data.plot.line(y="Close", use_index=True)  # Close vs. row index; NOTE(review): rows span several tickers back-to-back, so this concatenates their series — confirm intended
Out[267]:
<Axes: >
No description has been provided for this image
In [268]:
data.info() # confirm dropna() kept all 248 rows, and inspect each column's dtype / non-null count
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Ticker     248 non-null    object 
 1   Date       248 non-null    object 
 2   Open       248 non-null    float64
 3   High       248 non-null    float64
 4   Low        248 non-null    float64
 5   Close      248 non-null    float64
 6   Adj Close  248 non-null    float64
 7   Volume     248 non-null    int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 15.6+ KB
In [435]:
#Volume vs. Price Movement
import matplotlib.pyplot as plt  # imported locally: the notebook's global matplotlib import appears in a later cell, so Run All would fail here without it

fig, ax1 = plt.subplots(figsize=(12,6))

# Left axis: closing price as a line.
ax1.set_xlabel("Date")
ax1.set_ylabel("Closing Price", color="blue")
ax1.plot(data["Close"], label="Closing Price", color="blue")
ax1.tick_params(axis="y", labelcolor="blue")

# Right axis shares the x-axis so volume bars sit under the price line.
ax2 = ax1.twinx()
ax2.set_ylabel("Volume", color="green")
ax2.bar(data.index, data["Volume"], color="green", alpha=0.3)
ax2.tick_params(axis="y", labelcolor="green")

plt.title("Stock Price vs. Trading Volume")
plt.show()
#High volume with price movement = strong trend confirmation.
No description has been provided for this image
In [269]:
data["Tomorrow"]= data["Close"].shift(-1) # new column "Tomorrow" = next row's closing price; NOTE(review): rows span multiple tickers, so at each ticker boundary this pulls the NEXT ticker's first close — confirm acceptable
In [270]:
data
Out[270]:
Ticker Date Open High Low Close Adj Close Volume Tomorrow
0 AAPL 07-02-2023 150.639999 155.229996 150.639999 154.649994 154.414230 83322600 151.919998
1 AAPL 08-02-2023 153.880005 154.580002 151.169998 151.919998 151.688400 64120100 150.869995
2 AAPL 09-02-2023 153.779999 154.330002 150.419998 150.869995 150.639999 56007100 151.009995
3 AAPL 10-02-2023 149.460007 151.339996 149.220001 151.009995 151.009995 57450700 153.850006
4 AAPL 13-02-2023 150.949997 154.259995 150.919998 153.850006 153.850006 62199000 153.199997
... ... ... ... ... ... ... ... ... ...
243 GOOG 01-05-2023 107.720001 108.680000 107.500000 107.709999 107.709999 20926300 105.980003
244 GOOG 02-05-2023 107.660004 107.730003 104.500000 105.980003 105.980003 20343100 106.120003
245 GOOG 03-05-2023 106.220001 108.129997 105.620003 106.120003 106.120003 17116300 105.209999
246 GOOG 04-05-2023 106.160004 106.300003 104.699997 105.209999 105.209999 19780600 106.214996
247 GOOG 05-05-2023 105.320000 106.440002 104.738998 106.214996 106.214996 20705300 NaN

248 rows × 9 columns

In [271]:
data["Target"]=(data["Tomorrow"]>data["Close"]).astype(int)  # label: 1 if tomorrow's close is higher; the final row (Tomorrow=NaN) compares False and becomes 0
In [272]:
data
Out[272]:
Ticker Date Open High Low Close Adj Close Volume Tomorrow Target
0 AAPL 07-02-2023 150.639999 155.229996 150.639999 154.649994 154.414230 83322600 151.919998 0
1 AAPL 08-02-2023 153.880005 154.580002 151.169998 151.919998 151.688400 64120100 150.869995 0
2 AAPL 09-02-2023 153.779999 154.330002 150.419998 150.869995 150.639999 56007100 151.009995 1
3 AAPL 10-02-2023 149.460007 151.339996 149.220001 151.009995 151.009995 57450700 153.850006 1
4 AAPL 13-02-2023 150.949997 154.259995 150.919998 153.850006 153.850006 62199000 153.199997 0
... ... ... ... ... ... ... ... ... ... ...
243 GOOG 01-05-2023 107.720001 108.680000 107.500000 107.709999 107.709999 20926300 105.980003 0
244 GOOG 02-05-2023 107.660004 107.730003 104.500000 105.980003 105.980003 20343100 106.120003 1
245 GOOG 03-05-2023 106.220001 108.129997 105.620003 106.120003 106.120003 17116300 105.209999 0
246 GOOG 04-05-2023 106.160004 106.300003 104.699997 105.209999 105.209999 19780600 106.214996 1
247 GOOG 05-05-2023 105.320000 106.440002 104.738998 106.214996 106.214996 20705300 NaN 0

248 rows × 10 columns

In [273]:
data.shape  # (rows, columns) after adding Tomorrow and Target
Out[273]:
(248, 10)
In [344]:
#Training an initial ML model

from sklearn.ensemble import RandomForestClassifier
# RandomForest chosen for its accuracy, resistance to overfitting relative to
# single trees, and ability to capture non-linear patterns in the data.

# Feature columns used to predict the Target label.
predictors= ["Open","High","Low","Close","Volume"]

# n_estimators: number of decision trees (more trees -> more stable accuracy).
# min_samples_split: a high value guards against overfitting.
# random_state=1: makes every run reproduce the same results.
# class_weight="balanced": compensates for up/down class imbalance.
model= RandomForestClassifier(n_estimators=185, min_samples_split=100, random_state=1, class_weight="balanced")

# Chronological split: all but the last 100 rows train, the final 100 test.
train= data.iloc[ :-100]
test= data.iloc[-100: ]
model.fit(train[predictors], train["Target"])
Out[344]:
RandomForestClassifier(class_weight='balanced', min_samples_split=100,
                       n_estimators=185, random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(class_weight='balanced', min_samples_split=100,
                       n_estimators=185, random_state=1)
In [345]:
#Measuring the accuracy of the model
from sklearn.metrics import precision_score  # precision = fraction of predicted "up" days that actually went up
preds= model.predict(test[predictors]) # class predictions (0/1) for the hold-out split
In [346]:
preds= pd.Series(preds, index=test.index)  # align predictions with the test rows' index for later concat/plotting
In [347]:
print(set(preds))  # sanity check: the model predicts both classes, not just one
{0, 1}
In [348]:
precision_score(test["Target"], preds)  # precision on the hold-out split (~0.52 here, barely above chance)
Out[348]:
0.5205479452054794
In [349]:
#plotting the predictions
combined= pd.concat([test["Target"], preds], axis=1)  # actual labels and predictions side by side
In [350]:
combined.plot()  # overlay actual Target vs. predicted values
Out[350]:
<Axes: >
No description has been provided for this image
In [351]:
def predict(train, test, predictors, model):
    """Fit `model` on the training split and score the test split.

    Returns a DataFrame indexed like `test` with the actual "Target"
    column next to the model's "Predictions" column.
    """
    model.fit(train[predictors], train["Target"])
    predicted = pd.Series(model.predict(test[predictors]), index=test.index, name="Predictions")
    return pd.concat([test["Target"], predicted], axis=1)
In [438]:
#Rolling Volatility – Risk Measurement
import matplotlib.pyplot as plt  # imported locally: the notebook's matplotlib import is in a later cell, so Run All would otherwise fail here

# 30-day rolling standard deviation of daily percent returns.
# NOTE(review): rows span several tickers back-to-back, so the window crosses
# ticker boundaries — consider groupby("Ticker") before rolling.
data["Volatility"] = data["Close"].pct_change().rolling(window=30).std()
plt.figure(figsize=(12, 6))
plt.plot(data["Volatility"], label="30-Day Rolling Volatility", color="purple")
plt.xlabel("Date")
plt.ylabel("Volatility")
plt.title("Stock Rolling Volatility Over Time")
plt.legend()
plt.show()
No description has been provided for this image
In [372]:
#Building a backtesting system

def backtest(data1, model, predictors, start=50, step=10):
    """Walk-forward backtest.

    Starting at row `start`, repeatedly fit on every earlier row and
    predict the next `step` rows, collecting each window's predictions.
    """
    frames = []  # one prediction DataFrame per walk-forward window
    for split_at in range(start, data1.shape[0], step):
        history = data1.iloc[:split_at].copy()                  # all rows before the current window
        window = data1.iloc[split_at:(split_at + step)].copy()  # rows to predict this round
        result = predict(history, window, predictors, model)
        if result is not None and not result.empty:
            frames.append(result)
    if not frames:
        raise ValueError("No predictions generated. Check model training and prediction functions.")
    return pd.concat(frames, ignore_index=True)  # stack every window into one frame
In [373]:
predictions= backtest(data, model, predictors)  # walk-forward predictions over the whole dataset
In [374]:
predictions["Predictions"].value_counts() # how many up (1) vs. down (0) days the model predicted
Out[374]:
Predictions
1    109
0     89
Name: count, dtype: int64
In [375]:
data["Target"].value_counts()  # actual class balance of the label
Out[375]:
Target
0    129
1    119
Name: count, dtype: int64
In [376]:
precision_score(predictions["Target"], predictions["Predictions"])  # backtested precision across all walk-forward windows
Out[376]:
0.41284403669724773
In [377]:
predictions["Target"].value_counts() / predictions.shape[0]  # naive baseline: the raw share of up/down days to beat
Out[377]:
Target
0    0.535354
1    0.464646
Name: count, dtype: float64
In [441]:
import plotly.express as px

# Interactive Close chart; NOTE(review): the x-axis is the RangeIndex (row numbers), not dates — map Date back in if a time axis is wanted
fig = px.line(data, x=data.index, y="Close", title="Interactive Stock Price Chart")
fig.show()
In [398]:
#Adding additional predictors to our model, to improve accuracy
horizons= [2,5,30,60] # rolling-mean windows (days): we ratio today's close against the mean close over each window
new_predictors= []
# Keep numeric columns only so .rolling().mean() works.
# NOTE(review): this also drops Ticker/Date, so the rolling windows below run
# across ticker boundaries — confirm that is intended.
data = data.select_dtypes(include=["number"])
for horizon in horizons:
    rolling_averages= data.rolling(horizon).mean()
    ratio_column= f"Close_Ratio_{horizon}" # today's close relative to the window's average close
    data[ratio_column]= data["Close"] / rolling_averages["Close"]
    trend_column= f"Trend_{horizon}" # number of "up" days within the past `horizon` days
    # shift(1) excludes today's own target from the window; rolling only the
    # Target column (the original rolled the whole frame and then discarded
    # every column but Target) — identical values, much less work
    data[trend_column]= data["Target"].shift(1).rolling(horizon).sum()
    new_predictors+= [ratio_column, trend_column]
In [399]:
data
Out[399]:
Open High Low Close Adj Close Volume Tomorrow Target Close_Ratio_2 Trend_2 Close_Ratio_5 Trend_5 Close_Ratio_30 Trend_30 Close_Ratio_60 Trend_60
0 150.639999 155.229996 150.639999 154.649994 154.414230 83322600 151.919998 0 NaN NaN NaN NaN NaN NaN NaN NaN
1 153.880005 154.580002 151.169998 151.919998 151.688400 64120100 150.869995 0 0.991095 NaN NaN NaN NaN NaN NaN NaN
2 153.779999 154.330002 150.419998 150.869995 150.639999 56007100 151.009995 1 0.996532 0.0 NaN NaN NaN NaN NaN NaN
3 149.460007 151.339996 149.220001 151.009995 151.009995 57450700 153.850006 1 1.000464 1.0 NaN NaN NaN NaN NaN NaN
4 150.949997 154.259995 150.919998 153.850006 153.850006 62199000 153.199997 0 1.009316 2.0 1.009117 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
243 107.720001 108.680000 107.500000 107.709999 107.709999 20926300 105.980003 0 0.997638 0.0 1.009731 1.0 1.021180 13.0 1.000531 29.0
244 107.660004 107.730003 104.500000 105.980003 105.980003 20343100 106.120003 1 0.991904 0.0 0.990967 1.0 1.003494 13.0 1.018325 28.0
245 106.220001 108.129997 105.620003 106.120003 106.120003 17116300 105.209999 0 1.000660 1.0 0.989187 2.0 1.004731 13.0 1.056318 28.0
246 106.160004 106.300003 104.699997 105.209999 105.209999 19780600 106.214996 1 0.995694 1.0 0.986516 1.0 0.995804 13.0 1.047752 28.0
247 105.320000 106.440002 104.738998 106.214996 106.214996 20705300 NaN 0 1.004753 1.0 0.999699 2.0 1.005330 13.0 1.056670 29.0

248 rows × 16 columns

In [400]:
#Improving the model

# Fewer trees, looser split guard than the first model.
# NOTE(review): class_weight="balanced" from the first model was dropped here — confirm intentional.
model= RandomForestClassifier(n_estimators=100, min_samples_split=50, random_state=1, )
In [401]:
data
Out[401]:
Open High Low Close Adj Close Volume Tomorrow Target Close_Ratio_2 Trend_2 Close_Ratio_5 Trend_5 Close_Ratio_30 Trend_30 Close_Ratio_60 Trend_60
0 150.639999 155.229996 150.639999 154.649994 154.414230 83322600 151.919998 0 NaN NaN NaN NaN NaN NaN NaN NaN
1 153.880005 154.580002 151.169998 151.919998 151.688400 64120100 150.869995 0 0.991095 NaN NaN NaN NaN NaN NaN NaN
2 153.779999 154.330002 150.419998 150.869995 150.639999 56007100 151.009995 1 0.996532 0.0 NaN NaN NaN NaN NaN NaN
3 149.460007 151.339996 149.220001 151.009995 151.009995 57450700 153.850006 1 1.000464 1.0 NaN NaN NaN NaN NaN NaN
4 150.949997 154.259995 150.919998 153.850006 153.850006 62199000 153.199997 0 1.009316 2.0 1.009117 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
243 107.720001 108.680000 107.500000 107.709999 107.709999 20926300 105.980003 0 0.997638 0.0 1.009731 1.0 1.021180 13.0 1.000531 29.0
244 107.660004 107.730003 104.500000 105.980003 105.980003 20343100 106.120003 1 0.991904 0.0 0.990967 1.0 1.003494 13.0 1.018325 28.0
245 106.220001 108.129997 105.620003 106.120003 106.120003 17116300 105.209999 0 1.000660 1.0 0.989187 2.0 1.004731 13.0 1.056318 28.0
246 106.160004 106.300003 104.699997 105.209999 105.209999 19780600 106.214996 1 0.995694 1.0 0.986516 1.0 0.995804 13.0 1.047752 28.0
247 105.320000 106.440002 104.738998 106.214996 106.214996 20705300 NaN 0 1.004753 1.0 0.999699 2.0 1.005330 13.0 1.056670 29.0

248 rows × 16 columns

In [402]:
import numpy as np
def predict(train, test, predictors, model):
    """Fit `model` on the train split and predict the test split, calling an
    "up" day only when the predicted probability is at least 0.6 (stricter
    than the default 0.5 threshold).

    Returns a DataFrame indexed like `test` with both the actual "Target"
    column and the thresholded "Predictions" column — the shape backtest()
    and the later precision_score(predictions["Target"], ...) cell expect.
    (The previous version returned only "Predictions", which breaks that
    precision cell after re-running the backtest.)
    """
    train = train.copy()
    test = test.copy()
    # Fill missing predictor values with column means.
    # NOTE(review): the test split is imputed with its own means, which leaks
    # information from the evaluation window — consider reusing train means.
    train[predictors] = train[predictors].apply(lambda x: x.fillna(x.mean()))
    test[predictors] = test[predictors].apply(lambda x: x.fillna(x.mean()))

    model.fit(train[predictors], train["Target"])
    # Column 1 of predict_proba is the probability of class 1 (price goes up).
    probs = model.predict_proba(test[predictors])[:, 1]
    # Custom threshold: require >= 0.6 confidence before predicting an up move.
    preds = pd.Series(np.where(probs >= 0.6, 1, 0), index=test.index, name="Predictions")
    return pd.concat([test["Target"], preds], axis=1)
In [403]:
# Verify there are no NaNs left in either split's predictor columns
print("Missing values in train set:\n", train[predictors].isna().sum())
print("Missing values in test set:\n", test[predictors].isna().sum())
Missing values in train set:
 Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64
Missing values in test set:
 Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64
In [404]:
train.shape  # (rows, columns) of the training split
Out[404]:
(148, 10)
In [406]:
test.shape  # (rows, columns) of the test split
Out[406]:
(100, 10)
In [412]:
pd.Series(new_predictors).value_counts()  # confirm each engineered predictor was added exactly once
Out[412]:
Close_Ratio_2     1
Trend_2           1
Close_Ratio_5     1
Trend_5           1
Close_Ratio_30    1
Trend_30          1
Close_Ratio_60    1
Trend_60          1
Name: count, dtype: int64
In [405]:
data
Out[405]:
Open High Low Close Adj Close Volume Tomorrow Target Close_Ratio_2 Trend_2 Close_Ratio_5 Trend_5 Close_Ratio_30 Trend_30 Close_Ratio_60 Trend_60
0 150.639999 155.229996 150.639999 154.649994 154.414230 83322600 151.919998 0 NaN NaN NaN NaN NaN NaN NaN NaN
1 153.880005 154.580002 151.169998 151.919998 151.688400 64120100 150.869995 0 0.991095 NaN NaN NaN NaN NaN NaN NaN
2 153.779999 154.330002 150.419998 150.869995 150.639999 56007100 151.009995 1 0.996532 0.0 NaN NaN NaN NaN NaN NaN
3 149.460007 151.339996 149.220001 151.009995 151.009995 57450700 153.850006 1 1.000464 1.0 NaN NaN NaN NaN NaN NaN
4 150.949997 154.259995 150.919998 153.850006 153.850006 62199000 153.199997 0 1.009316 2.0 1.009117 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
243 107.720001 108.680000 107.500000 107.709999 107.709999 20926300 105.980003 0 0.997638 0.0 1.009731 1.0 1.021180 13.0 1.000531 29.0
244 107.660004 107.730003 104.500000 105.980003 105.980003 20343100 106.120003 1 0.991904 0.0 0.990967 1.0 1.003494 13.0 1.018325 28.0
245 106.220001 108.129997 105.620003 106.120003 106.120003 17116300 105.209999 0 1.000660 1.0 0.989187 2.0 1.004731 13.0 1.056318 28.0
246 106.160004 106.300003 104.699997 105.209999 105.209999 19780600 106.214996 1 0.995694 1.0 0.986516 1.0 0.995804 13.0 1.047752 28.0
247 105.320000 106.440002 104.738998 106.214996 106.214996 20705300 NaN 0 1.004753 1.0 0.999699 2.0 1.005330 13.0 1.056670 29.0

248 rows × 16 columns

In [428]:
predictions["Predictions"].value_counts()
Out[428]:
Predictions
0    149
1     35
Name: count, dtype: int64
In [430]:
import matplotlib.pyplot as plt

# Closing-price series, drawn via the explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data["Close"], label="Closing Price", color="blue")
ax.set_xlabel("Date")
ax.set_ylabel("Stock Price")
ax.set_title("Stock Price Trend Over Time")
ax.legend()
plt.show()
No description has been provided for this image
In [431]:
data["50_MA"] = data["Close"].rolling(window=50).mean()  # 50-day moving average
data["200_MA"] = data["Close"].rolling(window=200).mean()  # 200-day moving average; NOTE(review): with only 248 rows spanning several tickers this is mostly NaN and crosses ticker boundaries — confirm it is meaningful here

plt.figure(figsize=(12, 6))
plt.plot(data["Close"], label="Closing Price", color="blue")
plt.plot(data["50_MA"], label="50-Day MA", color="orange")
plt.plot(data["200_MA"], label="200-Day MA", color="red")
plt.xlabel("Date")
plt.ylabel("Stock Price")
plt.title("Stock Price with Moving Averages")
plt.legend()
plt.show()
#Identifies trends, bullish/bearish crossovers, and long-term patterns.
No description has been provided for this image
In [ ]: